Data Visualization

import pandas as pd
import math
import numpy as np
import numpy as np
import pandas as pd
import shapefile as shp
import matplotlib.pyplot as plt
import seaborn as sns
# Load the two input tables. Both files are decoded as latin-1.
# The county-level cancer data is parsed with pandas' default dtype inference.
cancer_df = pd.read_csv(
    'data/cancer_reg.csv',
    encoding='latin-1',
)
# The FIPS lookup is tab-separated; dtype=str keeps every column as text so the
# codes are not reinterpreted as numbers.
fips_df = pd.read_csv(
    'data/fips2county.tsv',
    sep='\t',
    header='infer',
    dtype=str,
    encoding='latin-1',
)
# Derived feature: death rate divided by median income.
cancer_df['Target_div_Income'] = cancer_df['TARGET_deathRate'] / cancer_df['medIncome']

# Split 'Geography' ("<county>, <state>") into separate 'County' and 'State' columns.
cancer_df[['County', 'State']] = cancer_df['Geography'].str.extract(r'(.+), (.+)')
# Drop the literal ' County' text; regex=False keeps this a plain substring
# replacement regardless of the pandas version's default.
cancer_df['County'] = cancer_df['County'].str.replace(' County', '', regex=False)

# Manually override two county names (rows addressed by index label).
# NOTE(review): these values are assigned after ' County' was stripped above, so
# they only match if fips_df['CountyName'] uses exactly these spellings - confirm.
cancer_df.loc[166, 'County'] = 'Dona Ana County'
cancer_df.loc[820, 'County'] = 'La Salle Parish'

# Attach the FIPS codes. County names repeat across states, so the join key must
# include the state: joining on the county name alone pairs each row with every
# same-named county in the lookup table and multiplies the rows.
# NOTE(review): assumes fips_df['StateName'] holds the same state spelling that
# appears after the comma in 'Geography' - confirm against the TSV.
cancer_df = pd.merge(cancer_df, fips_df,
                     left_on=['County', 'State'],
                     right_on=['CountyName', 'StateName'],
                     how='left')

# Derived feature: death rate divided by the natural log of median income
# (vectorised; no per-row Python call needed).
cancer_df['Target_div_LogIncome'] = cancer_df['TARGET_deathRate'] / np.log(cancer_df['medIncome'])

cancer_df
# graphdata: the FIPS code (exposed both as 'fips' and under its original name)
# next to the log-income-normalised death rate, exposed as 'values'.
graphdata = (
    cancer_df[['CountyFIPS', 'Target_div_LogIncome']]
    .rename(columns={'CountyFIPS': 'fips', 'Target_div_LogIncome': 'values'})
    .assign(CountyFIPS=cancer_df['CountyFIPS'])
)

# newbieLOG: standardise 'values' (subtract the mean, divide by the standard
# deviation), keep only scores above 1 and zero out everything else, then retain
# just the 'fips' and 'anomalies' columns.
newbieLOG = (
    graphdata[['fips', 'values']]
    .assign(anomalies=lambda frame: (frame['values'] - frame['values'].mean()) / frame['values'].std())
    .assign(anomalies=lambda frame: frame['anomalies'].where(frame['anomalies'] > 1, 0))
    [['fips', 'anomalies']]
)
newbieLOG
fips anomalies
0 53035 0.000000
1 53037 0.000000
2 53039 0.000000
3 16061 0.000000
4 21135 0.000000
... ... ...
14223 48159 0.000000
14224 50011 0.000000
14225 51067 0.000000
14226 53021 0.000000
14227 20061 1.140201

14228 rows × 2 columns

from urllib.request import urlopen
import json

# Download the county GeoJSON published with plotly's sample datasets; it is
# passed to the choropleth as the geometry source.
with urlopen('https://raw.githubusercontent.com/plotly/datasets/master/geojson-counties-fips.json') as response:
    counties = json.load(response)

import plotly.express as px

# Colour each location by its 'anomalies' score. range_color pins the colour
# scale to 0-4.
fig = px.choropleth_mapbox(newbieLOG, geojson=counties, locations='fips', color='anomalies',
                           color_continuous_scale="Hot_r",
                           range_color=(0, 4),
                           mapbox_style="carto-positron",
                           zoom=3, center = {"lat": 37.0902, "lon": -95.7129},
                           opacity=0.5,
                           # The label key must name a column that is actually plotted;
                           # the previous 'unemp' key matched nothing and had no effect.
                           labels={'anomalies': 'anomaly z-score'}
                          )
# Remove the figure margins so the map fills the output area.
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
fig.show()
# Show the column index of cancer_df (the notebook echoes it as the cell output).
cancer_df.columns
Index(['avgAnnCount', 'avgDeathsPerYear', 'TARGET_deathRate', 'incidenceRate',
       'medIncome', 'popEst2015', 'povertyPercent', 'studyPerCap', 'binnedInc',
       'MedianAge', 'MedianAgeMale', 'MedianAgeFemale', 'Geography',
       'AvgHouseholdSize', 'PercentMarried', 'PctNoHS18_24', 'PctHS18_24',
       'PctSomeCol18_24', 'PctBachDeg18_24', 'PctHS25_Over',
       'PctBachDeg25_Over', 'PctEmployed16_Over', 'PctUnemployed16_Over',
       'PctPrivateCoverage', 'PctPrivateCoverageAlone', 'PctEmpPrivCoverage',
       'PctPublicCoverage', 'PctPublicCoverageAlone', 'PctWhite', 'PctBlack',
       'PctAsian', 'PctOtherRace', 'PctMarriedHouseholds', 'BirthRate',
       'Target_div_Income', 'County', 'State', 'StateFIPS', 'CountyFIPS_3',
       'CountyName', 'StateName', 'CountyFIPS', 'StateAbbr', 'STATE_COUNTY',
       'Target_div_LogIncome'],
      dtype='object')
import pandas as pd
import pandas_bokeh
import matplotlib.pyplot as plt
import pgeocode
import geopandas as gpd
from shapely.geometry import Point
from geopandas import GeoDataFrame
# Call pandas_bokeh's notebook-output setup; presumably this is what emits the
# "Loading BokehJS ..." banner in the cell output - confirm.
pandas_bokeh.output_notebook()
import plotly.graph_objects as go

# Reshape to long format: one row per (source row, race-percentage column) pair.
# The column name goes into 'variable' and its number into the default 'value'
# column, with the death rate and median income repeated alongside.
df_race = cancer_df.melt(
    id_vars=['TARGET_deathRate', 'medIncome'],
    value_vars=['PctWhite', 'PctBlack', 'PctAsian', 'PctOtherRace'],
    var_name='variable',
)

def get_variable_group(variable):
    """Return the display group for a race-percentage column name.

    The match is case-insensitive, so both the dataset's column names
    ('PctWhite', 'PctBlack', 'PctAsian') and their lowercase forms are
    recognised. Comparing against the lowercase spellings only, as before,
    never matched the real column names and put every row in 'Other'.

    Args:
        variable: Column name to classify. Non-string input is converted
            with ``str`` before matching.

    Returns:
        'White', 'Black' or 'Asian' for the matching column, otherwise 'Other'.
    """
    groups = {
        'pctwhite': 'White',
        'pctblack': 'Black',
        'pctasian': 'Asian',
    }
    return groups.get(str(variable).lower(), 'Other')

# Tag every melted row with the group returned by get_variable_group.
df_race['variable_group'] = df_race['variable'].apply(get_variable_group)

# Pair each distinct group, in order of first appearance, with a colour.
colors = ['red', 'blue', 'green', 'purple']
color_map = dict(zip(df_race['variable_group'].unique(), colors))

# One panel per distinct 'variable' on a 2x2 grid: percentage on x, death rate
# on y, marker colour from the group, marker area scaled by median income.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 12))
for ax, variable in zip(axes.flat, df_race['variable'].unique()):
    data = df_race.loc[df_race['variable'] == variable]
    point_colors = data['variable_group'].map(color_map)
    point_sizes = data['medIncome'] / 5000
    ax.scatter(
        data['value'],
        data['TARGET_deathRate'],
        c=point_colors,
        s=point_sizes,
        alpha=0.7,
    )
    ax.set(
        xlabel='Percentage of population by race',
        ylabel='Target death rate',
        title=variable,
        ylim=(100, 400),
    )
fig.suptitle('Impact of race and income on target death rate', fontsize=16)
fig.tight_layout()
# Leave headroom above the panels for the figure-level title.
fig.subplots_adjust(top=0.92)
plt.show()
Loading BokehJS ...
_images/de2e3afc3708a87aca54811d3bc4e97ecf7b27b76a263272804a6aaba6b840e3.png